{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip install opendatasets\n",
    "# !pip install pandas\n",
    "# !pip install numpy\n",
    "# !pip install matplotlib\n",
    "# !pip install scikit-learn"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Importing files\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import opendatasets as op\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "# import matplotlib.pyplot as plt\n",
    "# from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics.pairwise import linear_kernel\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.metrics.pairwise import cosine_similarity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# op.download('https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('../tmdb-movie-metadata/tmdb_5000_credits.csv')\n",
    "df1 = pd.read_csv('../tmdb-movie-metadata/tmdb_5000_movies.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.columns = ['id', 'title', 'cast', 'crew']\n",
    "df1 = df1.merge(df, on = 'id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# C mean vote acorss report\n",
    "C = df1['vote_average'].mean()\n",
    "C"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "M = df1['vote_count'].quantile(0.9)\n",
    "M"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "qualify_movies = df1.copy().loc[df1['vote_count']>M]\n",
    "qualify_movies.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## weight_rating \n",
    "def weighted_rating(x, m=M, C=C):\n",
    "    v = x['vote_count']\n",
    "    R = x['vote_average']\n",
    "    # Calculation based on the IMDB formula\n",
    "    return (v/(v+m) * R) + (m/(m+v) * C)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define a new feature 'score' and calculate its value with `weighted_rating()`\n",
    "qualify_movies['score'] = qualify_movies.apply(weighted_rating, axis=1)\n",
    "qualify_movies['score'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "qualify_movies.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ploting graph\n",
    "pop = df1.sort_values('popularity', ascending=False)\n",
    "# pop[['title', 'popularity']]\n",
    "plt.figure(figsize=(12,4))\n",
    "plt.bar(pop['title_x'].head(5), pop['popularity'].head(5))\n",
    "plt.xlabel('Popularity')\n",
    "plt.ylabel('Movie')\n",
    "plt.title('Popular Movie')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the stringified features into their corresponding python objects\n",
    "from ast import literal_eval\n",
    "\n",
    "features = ['cast', 'crew', 'keywords', 'genres']\n",
    "for feature in features:\n",
    "    df1[feature] = df1[feature].apply(literal_eval)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_director(x):\n",
    "    for i in x:\n",
    "        if i['job'] == 'Director':\n",
    "            return i['name']\n",
    "    return np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_list(x):\n",
    "    if isinstance(x, list):\n",
    "        names = [i['name'] for i in x]\n",
    "        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.\n",
    "        if len(names) > 3:\n",
    "            names = names[:3]\n",
    "        return names\n",
    "\n",
    "    #Return empty list in case of missing/malformed data\n",
    "    return []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1['director'] = df1['crew'].apply(get_director)\n",
    "\n",
    "features = ['cast', 'keywords', 'genres']\n",
    "for feature in features:\n",
    "    df1[feature] = df1[feature].apply(get_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1[['title_x', 'cast', 'director', 'keywords', 'genres']].head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function to convert all strings to lower case and strip names of spaces\n",
    "def clean_data(x):\n",
    "    if isinstance(x, list):\n",
    "        return [str.lower(i.replace(\" \", \"\")) for i in x]\n",
    "    else:\n",
    "        #Check if director exists. If not, return empty string\n",
    "        if isinstance(x, str):\n",
    "            return str.lower(x.replace(\" \", \"\"))\n",
    "        else:\n",
    "            return ''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Apply clean_data function to your features.\n",
    "features = ['cast', 'keywords', 'director', 'genres']\n",
    "\n",
    "for feature in features:\n",
    "    df1[feature] = df1[feature].apply(clean_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_soup(x):\n",
    "    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])\n",
    "df1['soup'] = df1.apply(create_soup, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4803, 20978)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Using Sklearn\n",
    "# tfidf = TfidfVectorizer(stop_words='english')\n",
    "count = CountVectorizer(stop_words='english')\n",
    "df1['overview'] = df1['overview'].fillna('')\n",
    "# tfidf_matrix = tfidf.fit_transform(df1['overview'])\n",
    "count_matrix = count.fit_transform(df1['overview'])\n",
    "# tfidf_matrix.shape\n",
    "count_matrix.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# now we are using cosine similarity scores. but we can also use euclidean distance and Pearson \n",
    "# cosine_similarity_sim = linear_kernel(tfidf_matrix, tfidf_matrix)\n",
    "cosine_similarity_count = cosine_similarity(count_matrix, count_matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# reverse the indices and movie titles\n",
    "indices = pd.Series(df1.index, index =df1['title_x']).drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count Matrix\n",
    "# df1 = df1.reset_index()\n",
    "# indices = pd.Series(df1.index, index=df1['title_x'])\n",
    "\n",
    "indices = pd.Series(df1.index, index=df1['title'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Using TFIDF\n",
    "def get_recommendations(title, cosine_sim=cosine_similarity_count):\n",
    "    try:\n",
    "        # Get the index of the movie that matches the title\n",
    "        idx = indices[title]\n",
    "\n",
    "        # Get the pairwise similarity scores of all movies with that movie\n",
    "        sim_scores = list(enumerate(cosine_sim[idx]))\n",
    "\n",
    "        # Sort the movies based on the similarity scores\n",
    "        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)\n",
    "\n",
    "        # Get the scores of the 10 most similar movies\n",
    "        sim_scores = sim_scores[1:11]  # Adjust this number as needed\n",
    "\n",
    "        # Get the titles of the recommended movies\n",
    "        recommended_movies = df1['title'].iloc[sim_scores[0][0]]\n",
    "\n",
    "        return recommended_movies\n",
    "    except KeyError:\n",
    "        return \"Sorry, we don't have enough data about this movie title.\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Batman Forever'"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_recommendations('The Dark Knight Rises')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Assuming cosine_similarity_count is a numpy array\n",
    "np.save('cosine_similarity_count.npy', cosine_similarity_count)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Saving the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cosine_similarity_count = np.load('cosine_similarity_count.npy')\n",
    "recommendations = get_recommendations(\"Noice\", cosine_sim=cosine_similarity_count)\n",
    "recommendations\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Batman Forever\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.metrics.pairwise import linear_kernel\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "\n",
    "df1 = pd.read_csv('../tmdb-movie-metadata/tmdb_5000_movies.csv')\n",
    "\n",
    "\n",
    "count = CountVectorizer(stop_words='english')\n",
    "df1['overview'] = df1['overview'].fillna('')\n",
    "count_matrix = count.fit_transform(df1['overview'])\n",
    "\n",
    "\n",
    "cosine_similarity_count = cosine_similarity(count_matrix, count_matrix)\n",
    "\n",
    "indices = pd.Series(df1.index, index =df1['title']).drop_duplicates()\n",
    "\n",
    "# Using TFIDF\n",
    "def get_recommendations(title, cosine_sim=cosine_similarity_count):\n",
    "    try:\n",
    "        # Get the index of the movie that matches the title\n",
    "        idx = indices[title]\n",
    "\n",
    "        # Get the pairwise similarity scores of all movies with that movie\n",
    "        sim_scores = list(enumerate(cosine_sim[idx]))\n",
    "\n",
    "        # Sort the movies based on the similarity scores\n",
    "        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)\n",
    "\n",
    "        # Get the scores of the 10 most similar movies\n",
    "        sim_scores = sim_scores[1:2]  # Adjust this number as needed\n",
    "\n",
    "        # Get the titles of the recommended movies\n",
    "        recommended_movies = \n",
    "\n",
    "        return recommended_movies\n",
    "    except KeyError:\n",
    "        return \"Sorry, we don't have enough data about this movie title.\"\n",
    "    \n",
    "\n",
    "# cosine_similarity_count = np.load('cosine_similarity_count.npy')\n",
    "recommendations = get_recommendations('The Dark Knight Rises')\n",
    "print(recommendations[0])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.2"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
